infercnv results exploration
for SCPCS000208subdiagnosis <- readr::read_tsv(
file.path("..", "..", "..", "data", "current", params$scpca_project_id, "single_cell_metadata.tsv"),
show_col_types = FALSE
) |>
dplyr::filter(scpca_sample_id == params$sample_id) |>
dplyr::pull(subdiagnosis)This notebook explores using infercnv
to estimate tumor and normal cells in SCPCS000208 from SCPCP000006. This
sample has a(n) Anaplastic subdiagnosis.
infercnv was run using the 06_inferCNV.R
script with and without a normal reference. We also tested the impact of
the subselection of normal cells using either immune, and/or endothelial
cells as healthy reference.
In this notebook, we just want to compare the heatmaps of CNV profiles, and evaluate how comparable are the methods and how sensible they are to key parameters such as selection of healthy reference.
Here we defined function that will be used multiple time all along the notebook.
For a Seurat object object, the function
Do_CNV_heatmap load the infercnv object
created with the script 06_infercnv.R using
reference_value as a reference and call the function
SCpubr::do_CopyNumberVariantPlot to plot the mean CNV score
in each group defined in group.by.
object is the Seurat object
infercnv_obj is the infercnv
object
group.by is the metadata used for grouping the
violin plots
Do_CNV_heatmap <- function(object, infercnv_obj, group.by){
chromosome_locations = SCpubr::human_chr_locations
out <- SCpubr::do_CopyNumberVariantPlot(sample = object,
infercnv_object = infercnv_obj,
using_metacells = FALSE,
chromosome_locations = chromosome_locations,
return_object = FALSE,
group.by = group.by
)
out <- out +
ggtitle(glue::glue("Copy Number Variant Plot, ", reference_value)) +
ylab(label = "")
return(out)
}For a Seurat object an infercnv object
created with the script 06_infercnv.R using
reference_value as a reference, the function
Do_CNV_score load the infercnv object and
calculate a CNV score per cell. The score is calculated based on the biostar discussion.
It
reference_value is the selection of normal cells used
for infercnvFor a Seurat object objectand a metadata
metadata, the function visualize_metadata will
plot FeaturePlot and BarPlot
object is the Seurat object
meta the gene or quantitative value to be
plotted
group.by is the metadata used for grouping the
violin plots
visualize_metadata <- function(object, meta, group.by){
if(is.numeric(object@meta.data[,meta])){
d <- SCpubr::do_FeaturePlot(object,
features = meta,
pt.size = 0.2,
legend.width = 0.5,
legend.length = 5,
legend.position = "right") + ggtitle(meta)
b <- SCpubr::do_ViolinPlot(object,
features = meta,
ncol = 1,
group.by = group.by,
legend.position = "none")
return(d + b + plot_layout(ncol = 2, widths = c(2,4)))
}
else{
d <- SCpubr::do_DimPlot(object, reduction="umap", group.by = group.by, label = TRUE, repel = TRUE) + ggtitle(paste0(meta," - umap")) + theme(text=element_text(size=18))
b <- SCpubr::do_BarPlot(sample = object,
group.by = meta,
split.by = group.by,
position = "fill",
font.size = 10,
legend.ncol = 3) +
ggtitle("% cells")+
xlab(print(group.by)) +
theme(text=element_text(size=18))
return(d + b + plot_layout(ncol = 2, widths = c(2,4)))
}
}For a Seurat object objectand a features
features, the function visualize_feature will
plot FeaturePlot and ViolinPlot
object is the Seurat object
features the gene or quantitative value to be
plotted
group.by is the metadata used for grouping the
violin plots
visualize_feature <- function(object, features, group.by ){
d <- SCpubr::do_FeaturePlot(object,
features = features,
pt.size = 0.2,
legend.width = 0.5,
legend.length = 5,
legend.position = "right") + ggtitle(features)
b <- SCpubr::do_ViolinPlot(object,
features = features,
ncol = 1,
group.by = group.by,
legend.position = "none",
assay = "SCT") + ylab(features)
return(d + b + plot_layout(ncol = 2, widths = c(2,4)))
}For a Seurat object objectand a features
features, the function visualize_feature will
plot FeaturePlot and ViolinPlot
object is the Seurat object
features the gene or quantitative value to be
plotted
group.by is the metadata used for grouping the
violin plots
The input for this notebook are the results of
06_inferCNV.R
Here we plot the output of infercnv as heatmaps of CNV.
We first look at the png file generated by the infercnv
function. We then used the infercnv object to look at mean
CNV value across compartments (immune, endothelial, stroma and fetal
nephron).
infercnv_obj <- list()
for(reference_value in c("reference-none", "reference-immune", "reference-endothelium", "reference-both")){
infercnv_obj[[reference_value]] <- readRDS(file.path(result_dir, glue::glue(reference_value, "_HMM-no") , glue::glue("06_infercnv_", params$sample_id, "_", reference_value, "_HMM-no.rds")))
print(Do_CNV_heatmap(object = srat, infercnv_obj = infercnv_obj[[reference_value]], group.by = "fetal_kidney_predicted.compartment"))
}for(reference_value in c("reference-none", "reference-immune", "reference-endothelium", "reference-both")){
srat <- Do_CNV_score(srat,infercnv_obj = infercnv_obj[[reference_value]], reference_value)
p1 <- visualize_feature(srat, glue::glue("CNV-score_", reference_value) , group.by = "fetal_kidney_predicted.compartment" )
p2 <- visualize_density(srat, features=glue::glue("CNV-score_", reference_value), group.by = "fetal_kidney_predicted.compartment")
print(p1 + p2 + plot_layout(ncol = 3, widths = c(1,2,2)))
}This unique CNV score do not look so promising. We might have to select chromosomes we would like to look at, i.e. the one relevant for Wilms tumor.
Seurat objectWe load the Seurat object generated in
06_infercnv.R
For each chromosome, we look at the repartition of the
proportion_cnv_ in cells labeled as immune, endothelial,
stroma and fetal nephron. proportion_cnv_ is the proportion
in number of genes that are part of any cnv/loss/duplication in the
given chr.
This should allow a quick visual check that immune and endothelial cells do not expressed any CNV. If so, we should check if genes driving the CNV score are not key immune or endothelium genes.
Also, we should visualize if one area of the umap do not have CNV and could be identify as additional normal cell subset.
for(i in 1:22){
print(visualize_feature(srat, features=glue::glue("proportion_cnv_chr", i), group.by = "fetal_kidney_predicted.compartment"))
}For each chromosome, we look at the distribution of the
proportion_cnv_ in cells labeled as immune, endothelial,
stroma and fetal nephron. proportion_cnv_ is the proportion
in number of genes that are part of any cnv/loss/duplication in the
given chr.
We are quite confident that immune and endothelial cells are well
identified by label transfer done in
02b_label-transfer_fetal_kidney_reference_Stewart.Rmd. The
distribution of CNV for endothelial and immune cells should thus be a
single peak center on 0.
We do not know if fetal nephron and stroma cells are a mix of normal and cancer cells. Would they be a group of normal cells, we should expect a single peak center on 0 for every chromosome. As we expect to have a large number of cancer with heterogeneous CNV, we should see multiple peaks.
for(i in 1:22){
print(visualize_density(srat, features=glue::glue("proportion_cnv_chr", i), group.by = "fetal_kidney_predicted.compartment"))
}Seurat objectWe load the Seurat object generated in
06_infercnv.R
For each chromosome, we look at the repartition of the
proportion_cnv_ in cells labeled as immune, endothelial,
stroma and fetal nephron. proportion_cnv_ is the proportion
in number of genes that are part of any cnv/loss/duplication in the
given chr.
This should allow a quick visual check that immune and endothelial cells do not expressed any CNV. If so, we should check if genes driving the CNV score are not key immune or endothelium genes.
Also, we should visualize if one area of the umap do not have CNV and could be identify as additional normal cell subset.
for(i in 1:22){
print(visualize_feature(srat, features=glue::glue("proportion_cnv_chr", i), group.by = "fetal_kidney_predicted.compartment"))
}For each chromosome, we look at the distribution of the
proportion_cnv_ in cells labeled as immune, endothelial,
stroma and fetal nephron. proportion_cnv_ is the proportion
in number of genes that are part of any cnv/loss/duplication in the
given chr.
We are quite confident that immune and endothelial cells are well
identified by label transfer done in
02b_label-transfer_fetal_kidney_reference_Stewart.Rmd. The
distribution of CNV for endothelial and immune cells should thus be a
single peak center on 0.
We do not know if fetal nephron and stroma cells are a mix of normal and cancer cells. Would they be a group of normal cells, we should expect a single peak center on 0 for every chromosome. As we expect to have a large number of cancer with heterogeneous CNV, we should see multiple peaks.
for(i in 1:22){
print(visualize_density(srat, features=glue::glue("proportion_cnv_chr", i), group.by = "fetal_kidney_predicted.compartment"))
}Providing no reference is not a good option, as we think that
most of the cells are cancer cells with few CNV. Without healthy
reference, infercnv take the mean of expression across the
cells as a reference. In case of events that are well shared by a
majority of the cells, they will be smoothed.
For some CNV and specific chromosome, the choice a the reference do not affect the result: ch18, ch17.
Some CNV values differs depending on the choice of the reference, for chr5 for example.
This is an issue if we want to have the exact CNV pattern.
There are three samples with a very low (<5) number of immune +
endothelial cells: SCPCS000197 (4 endothelial), SCPCS000190 (1 immune),
SCPCS000177 (2 immune) for which it might be problematic to run
infercnv.
I am not sure if some endothelial cells might be missclassified? To be careful when we look at % of alterations within a compartment, as there are sometimes very few immune and/or endothelial cells, some % can be easily high without being meaningful.
HMM-i3 model looks “cleaner” than the HMM-i6 model, with less CNV found in immune and endothelial cells.
# record the versions of the packages used in this analysis and other environment information
sessionInfo()## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Vienna
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] optparse_1.7.5 Seurat_5.1.0 SeuratObject_5.0.2 sp_2.1-4 patchwork_1.2.0 ggplot2_3.5.1 SCpubr_2.0.2 infercnv_1.20.0
##
## loaded via a namespace (and not attached):
## [1] fs_1.6.4 matrixStats_1.3.0 spatstat.sparse_3.1-0 bitops_1.0-8 httr_1.4.7 RColorBrewer_1.1-3
## [7] doParallel_1.0.17 tools_4.4.1 sctransform_0.4.1 utf8_1.2.4 R6_2.5.1 DT_0.33
## [13] lazyeval_0.2.2 uwot_0.2.2 ggdist_3.3.2 withr_3.0.1 gridExtra_2.3 parallelDist_0.2.6
## [19] progressr_0.14.0 argparse_2.2.3 cli_3.6.3 Biobase_2.64.0 formatR_1.14 spatstat.explore_3.3-2
## [25] fastDummies_1.7.4 sandwich_3.1-1 sass_0.4.9 labeling_0.4.3 mvtnorm_1.3-1 spatstat.data_3.1-2
## [31] readr_2.1.5 ggridges_0.5.6 pbapply_1.7-2 yulab.utils_0.1.7 parallelly_1.38.0 limma_3.60.4
## [37] rstudioapi_0.16.0 RSQLite_2.3.7 gridGraphics_0.5-1 generics_0.1.3 gtools_3.9.5 ica_1.0-3
## [43] spatstat.random_3.3-1 vroom_1.6.5 dplyr_1.1.4 distributional_0.5.0 Matrix_1.7-0 futile.logger_1.4.3
## [49] fansi_1.0.6 S4Vectors_0.42.1 abind_1.4-5 lifecycle_1.0.4 multcomp_1.4-26 yaml_2.3.10
## [55] edgeR_4.2.1 SummarizedExperiment_1.34.0 gplots_3.1.3.1 SparseArray_1.4.8 Rtsne_0.17 grid_4.4.1
## [61] blob_1.2.4 promises_1.3.0 crayon_1.5.3 miniUI_0.1.1.1 lattice_0.22-6 cowplot_1.1.3
## [67] KEGGREST_1.44.1 pillar_1.9.0 knitr_1.48 GenomicRanges_1.56.1 future.apply_1.11.2 codetools_0.2-20
## [73] leiden_0.4.3.1 glue_1.7.0 spatstat.univar_3.0-0 data.table_1.16.0 vctrs_0.6.5 png_0.1-8
## [79] spam_2.10-0 gtable_0.3.5 assertthat_0.2.1 cachem_1.1.0 xfun_0.47 S4Arrays_1.4.1
## [85] mime_0.12 libcoin_1.0-10 coda_0.19-4.1 survival_3.7-0 DElegate_1.2.1 SingleCellExperiment_1.26.0
## [91] iterators_1.0.14 statmod_1.5.0 fitdistrplus_1.2-1 TH.data_1.1-2 ROCR_1.0-11 nlme_3.1-166
## [97] bit64_4.0.5 RcppAnnoy_0.0.22 GenomeInfoDb_1.40.1 rprojroot_2.0.4 bslib_0.8.0 irlba_2.3.5.1
## [103] KernSmooth_2.23-24 colorspace_2.1-1 BiocGenerics_0.50.0 DBI_1.2.3 tidyselect_1.2.1 bit_4.0.5
## [109] compiler_4.4.1 DelayedArray_0.30.1 plotly_4.10.4 scales_1.3.0 caTools_1.18.2 lmtest_0.9-40
## [115] stringr_1.5.1 digest_0.6.37 goftest_1.2-3 spatstat.utils_3.1-0 rmarkdown_2.28 XVector_0.44.0
## [121] htmltools_0.5.8.1 pkgconfig_2.0.3 MatrixGenerics_1.16.0 highr_0.11 fastmap_1.2.0 rlang_1.1.4
## [127] htmlwidgets_1.6.4 UCSC.utils_1.0.0 shiny_1.9.1 jquerylib_0.1.4 farver_2.1.2 zoo_1.8-12
## [133] jsonlite_1.8.8 magrittr_2.0.3 ggplotify_0.1.2 modeltools_0.2-23 GenomeInfoDbData_1.2.12 dotCall64_1.1-1
## [139] munsell_0.5.1 Rcpp_1.0.13 ape_5.8 viridis_0.6.5 reticulate_1.38.0 stringi_1.8.4
## [145] zlibbioc_1.50.0 MASS_7.3-61 plyr_1.8.9 parallel_4.4.1 listenv_0.9.1 ggrepel_0.9.5
## [151] forcats_1.0.0 deldir_2.0-4 Biostrings_2.72.1 splines_4.4.1 tensor_1.5 hms_1.1.3
## [157] locfit_1.5-9.10 igraph_2.0.3 fastcluster_1.2.6 spatstat.geom_3.3-2 RcppHNSW_0.6.0 reshape2_1.4.4
## [163] stats4_4.4.1 futile.options_1.0.1 evaluate_0.24.0 RcppParallel_5.1.9 lambda.r_1.2.4 renv_1.0.7
## [169] BiocManager_1.30.25 tzdb_0.4.0 phyclust_0.1-34 foreach_1.5.2 httpuv_1.6.15 getopt_1.20.4
## [175] RANN_2.6.2 tidyr_1.3.1 purrr_1.0.2 polyclip_1.10-7 future_1.34.0 scattermore_1.2
## [181] coin_1.4-3 xtable_1.8-4 RSpectra_0.16-2 later_1.3.2 viridisLite_0.4.2 rjags_4-16
## [187] tibble_3.2.1 memoise_2.0.1 AnnotationDbi_1.66.0 IRanges_2.38.1 cluster_2.1.6 globals_0.16.3